This notebook contains notes and sketches created whilst exploring a particular committee report, the Women and Equalities Committee Gender pay gap inquiry report.
(From a cursory inspection of several other HTML published reports, there appears to be significant inconsistency in how reports from different committees are presented online. A closer look at other reports, and at the major differences across them, is left for a later date.)
In [2]:
url='https://publications.parliament.uk/pa/cm201516/cmselect/cmwomeq/584/58402.htm'
Observation: from the report contents page, I can navigate via the Back button to https://publications.parliament.uk/pa/cm201516/cmselect/cmwomeq/584/58401.htm, but it's then not at all clear where I am.
It would probably make sense to be able to get back to the inquiry page for the inquiry that resulted in the report.
In [149]:
import pandas as pd
In [ ]:
import requests
import requests_cache
requests_cache.install_cache('parli_comm_cache')
from bs4 import BeautifulSoup
#https://www.dataquest.io/blog/web-scraping-tutorial-python/
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
In [23]:
#What does a ToC item look like?
soup.select('p[class*="ToC"]')[5].find('a')
Out[23]:
In [117]:
url_written=None
url_witnesses=None
for p in soup.select('p[class*="ToC"]'):
    a=p.find('a')
    #Witnesses page
    if 'Witnesses' in a.text:
        url_witnesses=a['href']
    #Published written evidence page
    if 'Published written evidence' in a.text:
        url_written=a['href']
url_written, url_witnesses
Out[117]:
In [24]:
#https://stackoverflow.com/a/34661518/454773
pages=[]
for EachPart in soup.select('p[class*="ToC"]'):
    href=EachPart.find('a')['href']
    #Fudge to collect URLs of pages associated with report content
    if '#_' in href:
        pages.append(href.split('#')[0])
pages=list(set(pages))
pages
Out[24]:
In [7]:
#We need the base path of the report so we can resolve the relative page URLs
import os.path
stub=os.path.split(url)
stub
Out[7]:
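As an aside, os.path.split happens to work on this URL, but urllib.parse.urljoin is arguably the more natural tool for resolving the report's relative hrefs against the contents page; a minimal sketch:
In [ ]:
from urllib.parse import urljoin

#Resolve a relative href, e.g. the first collected page, against the contents page URL
urljoin(url, pages[0])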
In [25]:
#Grab (and cache) all the pages in the report
#Note: only the last response, r, is kept; it's inspected in the next cell
for p in pages:
    r=requests.get('{}/{}'.format(stub[0],p))
In [315]:
pagesoup=BeautifulSoup(r.content, 'html.parser')
#Preview the main content block of the last fetched page
print(str(pagesoup.select('div[id="shellcontent"]')[0])[:2000])
In [102]:
import re

def evidenceRef(pagesoup):
    qs=[]
    ws=[]
    #Work through the list of footnotes on the page
    for p in pagesoup.select('div[class="_idFootnote"]'):
        #Find oral evidence question numbers, e.g. a footnote ending 'Q45'
        q=re.search(r'^.*\s+(Q[0-9]*)\s*$', p.find('p').text)
        if q:
            qs.append(q.group(1))
        #Find links to written evidence
        links=p.find('p').findAll('a')
        if len(links)>1:
            if links[1]['href'].startswith('http://data.parliament.uk/WrittenEvidence/CommitteeEvidence.svc/EvidenceDocument/'):
                ws.append(links[1].text.strip('()'))
    return qs, ws
In [103]:
evidenceRef(pagesoup)
Out[103]:
In [104]:
qs=[]
ws=[]
#Collect oral and written evidence references from every page in the report
for p in pages:
    r=requests.get('{}/{}'.format(stub[0],p))
    pagesoup=BeautifulSoup(r.content, 'html.parser')
    qstmp,wstmp=evidenceRef(pagesoup)
    qs+=qstmp
    ws+=wstmp
In [310]:
#Most frequently referenced oral evidence questions
pd.DataFrame(qs)[0].value_counts().head()
Out[310]:
In [309]:
#Most frequently referenced written evidence submissions
pd.DataFrame(ws)[0].value_counts().head()
Out[309]:
In [206]:
#url='https://publications.parliament.uk/pa/cm201516/cmselect/cmwomeq/584/58414.htm'
if url_witnesses is not None:
    r=requests.get('{}/{}'.format(stub[0],url_witnesses))
    pagesoup=BeautifulSoup(r.content, 'html.parser')
    #Try to pair each session heading with the table of witnesses that follows it
    l1=[t.text.split('\t')[0] for t in pagesoup.select('h2[class="WitnessHeading"]')]
    l2=pagesoup.select('table')
pd.DataFrame({'heading':l1,'table':l2})
Out[206]:
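The heading/table pairing above is fragile. Before falling back to retyping the witness list by hand (next cell), pandas' own HTML table parser might be worth a try; a sketch, assuming url_witnesses was found above and an HTML parser such as lxml is installed:
In [ ]:
#read_html parses each <table> on the witnesses page into a DataFrame
witness_tables = pd.read_html('{}/{}'.format(stub[0], url_witnesses))
witness_tables[0].head()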
In [308]:
#Just as easy to do this by hand
items=[]
items.append(['Tuesday 15 December 2015','Chris Giles', 'Economics Editor', 'The Financial Times','Q1', 'Q35'])
items.append(['Tuesday 15 December 2015','Dr Alison Parken', 'Women Adding Value to the Economy (WAVE)', 'Cardiff University','Q1', 'Q35'])
items.append(['Tuesday 15 December 2015','Professor Jill Rubery','', 'Manchester University','Q1', 'Q35'])
items.append(['Tuesday 15 December 2015','Sheila Wild', 'Founder', 'Equal Pay Portal','Q1', 'Q35'])
items.append(['Tuesday 15 December 2015','Professor the Baroness Wolf of Dulwich', '', "King's College London",'Q1', 'Q35'])
items.append(['Tuesday 15 December 2015','Neil Carberry', 'Director for Employment and Skills', 'CBI','Q36','Q58'])
items.append(['Tuesday 15 December 2015','Ann Francke', 'Chief Executive', 'Chartered Management Institute','Q36','Q58'])
items.append(['Tuesday 15 December 2015','Monika Queisser','Senior Counsellor and Head of Social Policy', 'Organisation for Economic Cooperation and Development','Q36','Q58'])
items.append(['Tuesday 12 January 2016','Amanda Brown', 'Assistant General Secretary', 'NUT','Q59','Q99'])
items.append(['Tuesday 12 January 2016','Dr Sally Davies', 'President', "Medical Women's Federation",'Q59','Q99'])
items.append(['Tuesday 12 January 2016','Amanda Fone','Chief Executive Officer', 'F1 Recruitment and Search','Q59','Q99'])
items.append(['Tuesday 12 January 2016','Audrey Williams', 'Employment Lawyer and Partner',' Fox Williams','Q59','Q99'])
items.append(['Tuesday 12 January 2016','Anna Ritchie Allan', 'Project Manager', 'Close the Gap','Q100','Q136'])
items.append(['Tuesday 12 January 2016','Christopher Brooks', 'Policy Adviser', 'Age UK','Q100','Q136'])
items.append(['Tuesday 12 January 2016','Scarlet Harris', 'Head of Gender Equality', 'TUC','Q100','Q136'])
items.append(['Tuesday 12 January 2016','Mr Robert Stephenson-Padron', 'Managing Director', 'Penrose Care','Q100','Q136'])
items.append(['Tuesday 19 January 2016','Sarah Jackson', 'Chief Executive', 'Working Families','Q137','Q164'])
items.append(['Tuesday 19 January 2016','Adrienne Burgess', 'Joint Chief Executive and Head of Research', 'Fatherhood Institute','Q137','Q164'])
items.append(['Tuesday 19 January 2016','Maggie Stilwell', 'Partner', 'Ernst & Young LLP','Q137','Q164'])
items.append(['Tuesday 26 January 2016','Michael Newman', 'Vice-Chair', 'Discrimination Law Association','Q165','Q191'])
items.append(['Tuesday 26 January 2016','Duncan Brown', '','Institute for Employment Studies','Q165','Q191'])
items.append(['Tuesday 26 January 2016','Tim Thomas', 'Head of Employment and Skills', "EEF, the manufacturers' association",'Q165','Q191'])
items.append(['Tuesday 26 January 2016','Helen Fairfoul', 'Chief Executive', 'Universities and Colleges Employers Association','Q192','Q223'])
items.append(['Tuesday 26 January 2016','Emma Stewart', 'Joint Chief Executive Officer', 'Timewise Foundation','Q192','Q223'])
items.append(['Tuesday 26 January 2016','Claire Turner','', 'Joseph Rowntree Foundation','Q192','Q223'])
items.append(['Wednesday 10 February 2016','Rt Hon Nicky Morgan MP', 'Secretary of State for Education and Minister for Women and Equalities','Department for Education','Q224','Q296'])
items.append(['Wednesday 10 February 2016','Nick Boles MP', 'Minister for Skills', 'Department for Business, Innovation and Skills','Q224','Q296'])
df=pd.DataFrame(items,columns=['Date','Name','Role','Org','Qmin','Qmax'])
#Clean the Org column and derive numeric question-range columns
df['Org']=df['Org'].str.strip()
df['n_qmin']=df['Qmin'].str.strip('Q').astype(int)
df['n_qmax']=df['Qmax'].str.strip('Q').astype(int)
#Label each session by its question range, e.g. Q1-35
df['session']=df['Qmin']+'-'+df['n_qmax'].astype(str)
df.head()
Out[308]:
In [307]:
#url='https://publications.parliament.uk/pa/cm201516/cmselect/cmwomeq/584/58415.htm'
all_written=[]
if url_written is not None:
    r=requests.get('{}/{}'.format(stub[0],url_written))
    pagesoup=BeautifulSoup(r.content, 'html.parser')
    for p in pagesoup.select('p[class="EvidenceList1"]'):
        #Get rid of span tags (findAll takes a tag name and attrs, not a CSS selector)
        for match in p.findAll('span', {'class': 'EvidenceList1Span'}):
            match.extract()
        all_written.append((p.contents[1].strip('()').strip(), p.find('a')['href'], p.find('a').text))
written_df=pd.DataFrame(all_written, columns=['Org','URL','RefNumber'])
written_df.head()
Out[307]:
In [266]:
def getSession(q):
    #Return the session label for the session containing question q
    return df[(df['n_qmin']<=q) & (df['n_qmax']>=q)].iloc[0]['session']

getSession(33)
Out[266]:
In [282]:
#Count how many times each session's questions are referenced in the report
df_qs=pd.DataFrame(qs, columns=['qn'])
df_qs['session']=df_qs['qn'].apply(lambda x: getSession(int(x.strip('Q'))))
s_qs_cnt=df_qs['session'].value_counts()
s_qs_cnt
Out[282]:
In [289]:
#Tally question references by session, alongside the witnesses for that session
pd.concat([s_qs_cnt,df.groupby('session')['Org'].apply(lambda x: '; '.join(list(x)))],
          axis=1).sort_values('session',ascending=False)
Out[289]:
In [306]:
#Count written evidence references in the report, by submitting organisation
df_ws=pd.DataFrame(ws,columns=['RefNumber'])
df_ws=df_ws.merge(written_df, on='RefNumber')
df_ws['Org'].value_counts().head()
Out[306]:
In [305]:
#Organisations that gave written and witness evidence
set(df_ws['Org']).intersection(set(df['Org']))
#Note there are more matches hidden by dirty data
#- e.g. NUT and National Union of Teachers are presumably the same
#- e.g. F1 Recruitment and Search and F1 Recruitment Ltd are presumably the same
#A fuzzy matching pass (sketched below) might surface some of these
Out[305]:
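A fuzzy matching pass might surface some of those hidden matches; a minimal sketch using difflib from the standard library (the 0.8 cutoff is an arbitrary assumption):
In [ ]:
import difflib

#Look for written-evidence orgs that nearly match a witness org
for org in set(df_ws['Org']) - set(df['Org']):
    close = difflib.get_close_matches(org, list(df['Org'].unique()), n=1, cutoff=0.8)
    if close:
        print('{} ~ {}'.format(org, close[0]))
Pure abbreviations such as NUT versus National Union of Teachers are unlikely to clear a similarity cutoff, so a hand-built lookup table would probably still be needed for those.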
In [ ]:
url='https://publications.parliament.uk/pa/cm201617/cmselect/cmwomeq/963/96302.htm'
In [ ]:
#TODO: explore inconsistency across different reports in terms of presentation and linking to evidence
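As a quick first check, a sketch that fetches the new report's contents page and counts the ToC-styled paragraphs the scraper above depends on; a zero count would signal different markup conventions:
In [ ]:
page = requests.get(url)
soup2 = BeautifulSoup(page.content, 'html.parser')
#If this report follows the same conventions, ToC-styled paragraphs should be found
len(soup2.select('p[class*="ToC"]'))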